{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6e4be20c-3c5d-48ca-9262-a35b96b42938",
   "metadata": {},
   "source": [
    "# L5: Extracting Tables"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6eedf3f1-dcc5-4015-86eb-2c74305da507",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a32702ef-4d4d-44c0-a369-793acee34a43",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Warning control\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cebad2f-8a8c-411d-a103-0ceb9f9a0a3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured_client import UnstructuredClient\n",
    "from unstructured_client.models import shared\n",
    "from unstructured_client.models.errors import SDKError\n",
    "\n",
    "from unstructured.staging.base import dict_to_elements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "301ec997-973a-4b31-b02c-3fb018795af4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Utils import Utils\n",
    "utils = Utils()\n",
    "\n",
    "DLAI_API_KEY = utils.get_dlai_api_key()\n",
    "DLAI_API_URL = utils.get_dlai_url()\n",
    "\n",
    "s = UnstructuredClient(\n",
    "    api_key_auth=DLAI_API_KEY,\n",
    "    server_url=DLAI_API_URL,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b0feddb-a822-4900-a666-437520049c5e",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Access Utils File and Helper Functions:</b> To access helper functions and other related files for this notebook, 1) click on the <em>\"View\"</em> option on the top menu of the notebook and then 2) click on <em>\"File Browser\"</em>. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b33fea6f-513a-4864-bcfc-f9970a57994b",
   "metadata": {},
   "source": [
    "## Example Document: Embedded Images and Tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f5f47c7-8324-4695-9f71-43bffc4aa1f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image\n",
    "Image(filename=\"images/embedded-images-tables.jpg\", height=600, width=600) "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9180b5b3-3bdc-4fb1-8112-1551ee96a251",
   "metadata": {},
   "source": [
    "## Process the Document and Extract Tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "868ca1d7-d093-482e-90cf-25207e882285",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/embedded-images-tables.pdf\"\n",
    "\n",
    "with open(filename, \"rb\") as f:\n",
    "    files=shared.Files(\n",
    "        content=f.read(),\n",
    "        file_name=filename,\n",
    "    )\n",
    "\n",
    "req = shared.PartitionParameters(\n",
    "    files=files,\n",
    "    strategy=\"hi_res\",\n",
    "    hi_res_model_name=\"yolox\",\n",
    "    skip_infer_table_types=[],\n",
    "    pdf_infer_table_structure=True,\n",
    ")\n",
    "\n",
    "try:\n",
    "    resp = s.general.partition(req)\n",
    "    elements = dict_to_elements(resp.elements)\n",
    "except SDKError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0ab8092-9111-4b5b-b4fc-0c4a5275eed4",
   "metadata": {},
   "outputs": [],
   "source": [
    "tables = [el for el in elements if el.category == \"Table\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d986ccf-c1d0-4b1a-bda8-2d240a99278e",
   "metadata": {},
   "outputs": [],
   "source": [
    "tables[0].text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "162a4f56-bfc8-444e-a97a-8a05770e12cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "table_html = tables[0].metadata.text_as_html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81c8ee3b-94b9-4e80-9d7e-b82bf18656ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import StringIO \n",
    "from lxml import etree\n",
    "\n",
    "parser = etree.XMLParser(remove_blank_text=True)\n",
    "file_obj = StringIO(table_html)\n",
    "tree = etree.parse(file_obj, parser)\n",
    "print(etree.tostring(tree, pretty_print=True).decode())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21a5c596-a836-4877-a539-16bfc312744a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.display import HTML\n",
    "HTML(table_html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "282f303e-3c17-4161-8f32-b55e4951fd70",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import ChatOpenAI\n",
    "from langchain_core.documents import Document\n",
    "from langchain.chains.summarize import load_summarize_chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec5ed63-a143-417b-b163-69122e2a8006",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-1106\")\n",
    "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n",
    "chain.invoke([Document(page_content=table_html)])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0201f43-e0cb-4ed5-b0c2-bf4519a8324b",
   "metadata": {},
   "source": [
    "## Work With Your Own Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "157d8aac-6d3b-40c4-a1bc-77531e56c164",
   "metadata": {},
   "outputs": [],
   "source": [
    "import panel as pn\n",
    "#import param\n",
    "from Utils import upld_file\n",
    "pn.extension()\n",
    "\n",
    "upld_widget = upld_file()\n",
    "pn.Row(upld_widget.widget_file_upload)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c32b416-0c1b-41dc-9021-daf19732d6a8",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> 🖥 &nbsp; <b>Note:</b> If the file upload interface isn't functioning properly, the issue may be related to your browser version. In such a case, please ensure your browser is updated to the latest version, or try using a different browser.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f87b9f75-4261-408f-8cc5-05544a545038",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls ./example_files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09f61595-f22b-434f-95b4-8136e3634cca",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Uploading Your Own File - Method 2:</b> To upload your own files, you can also 1) click on the <em>\"View\"</em> option on the top menu of the notebook and then 2) click on <em>\"File Browser\"</em>. Then 3) click on <em>\"Upload\"</em> button to upload your files. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d00743fc-fd4a-499f-b349-951a24d357e5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6512138c-afb0-42f8-81cc-cfc9a5535185",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e47b9e6-4946-495d-ad36-18a06e1a811c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfd42be0-b1d0-4a01-8bde-8b2039ca7a9a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23949833-9724-4882-984e-47b6866f01f1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b6d4e54-bd45-4276-8181-e2b5b0f3f429",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb632547-2723-465a-8cfe-1ef2f3782104",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c76ef8ef-36e8-412c-8fc2-bde5aa00e594",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
